import time

from pyspark import SparkConf, SparkContext, StorageLevel

if __name__ == '__main__':
    # Demo script: checkpoint an intermediate RDD and reuse it from two
    # downstream jobs, then keep the driver alive so the run can be inspected.

    # Build the SparkConf object.
    conf = SparkConf().setAppName("test").setMaster("local[*]")
    # Build the SparkContext, the entry point to the execution environment.
    sc = SparkContext(conf=conf)

    try:
        # Enable checkpointing and tell Spark where to store checkpoint files.
        sc.setCheckpointDir("hdfs://node1:8020/output/checkpoint")

        rdd = sc.parallelize([1, 2, 3, 4, 5, 6])

        rdd2 = rdd.map(lambda x: x + 1)

        # Persist before checkpointing: the checkpoint is written by a separate
        # job after the first action, so without a persisted copy rdd2 would be
        # computed twice (once for the action, once for the checkpoint write).
        rdd2.persist(StorageLevel.MEMORY_AND_DISK)
        # Mark rdd2 for checkpointing; this must happen before any action runs
        # on it. The data is materialized on the first action below.
        rdd2.checkpoint()

        rdd3 = rdd2.map(lambda x: x * 2)
        print(rdd3.collect())

        rdd4 = rdd2.map(lambda x: x * 4)
        print(rdd4.collect())

        # Keep the driver process alive for a long time so the job can be
        # inspected while the application is still running.
        time.sleep(1000000)

        # Release the persisted copy of rdd2; checkpoint files are unaffected.
        rdd2.unpersist()
    finally:
        # Always shut the context down, even on error or Ctrl-C during sleep.
        sc.stop()